qol_caret

Code
library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(caret)
library(ranger)
library(doParallel)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

Code
# Load the survey export and prepare it for modelling:
#   1. every text column becomes a factor;
#   2. selected factors get an explicit reference level;
#   3. the income brackets are collapsed into a Below/Above factor
#      (split at $60,000);
#   4. the ordinal survey items get their levels in scale order.
qol <- read_csv("AAQoL.csv") |>
  mutate(across(where(is.character), as.factor)) |>
  mutate(
    `English Difficulties` = relevel(`English Difficulties`, ref = "Not at all"),
    `English Speaking` = relevel(`English Speaking`, ref = "Not at all"),
    Ethnicity = relevel(Ethnicity, ref = "Chinese"),
    Religion = relevel(Religion, ref = "None")
  ) |>
  mutate(
    Income_median = case_match(
      Income,
      c(
        "$0 - $9,999",
        "$10,000 - $19,999",
        "$20,000 - $29,999",
        "$30,000 - $39,999",
        "$40,000 - $49,999",
        "$50,000 - $59,999"
      ) ~ "Below",
      c(
        "$60,000 - $69,999",
        "$70,000 and over"
      ) ~ "Above",
      .default = Income
    ),
    # Anything that is not "Below"/"Above" at this point becomes NA.
    Income_median = factor(Income_median, levels = c("Below", "Above"))
  ) |>
  mutate(
    across(
      `Familiarity with America`:`Familiarity with Ethnic Origin`,
      \(item) factor(item, levels = c("Very low", "Low", "High", "Very high"))
    ),
    `Identify Ethnically` = factor(
      `Identify Ethnically`,
      levels = c("Not at all", "Not very close", "Somewhat close", "Very close")
    ),
    Belonging = factor(
      Belonging,
      levels = c("Not at all", "Not very much", "Somewhat", "Very much")
    ),
    `Primary Language` = as.factor(`Primary Language`)
  )
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
Code
# Show the prepared survey data as an interactive DataTables widget.
DT::datatable(qol)
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Physical Check-up

Code
# install.packages("randomForest")  # backend used by caret's method = "rf"

# Modelling frame for the Physical Check-up outcome: keep the outcome and the
# chosen predictors, drop rows with any missing value, shorten three column
# names, and make every name syntactically valid for the formula interface.
rfdata <- qol |>
  select(
    `Physical Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame() |>
  rename_with(make.names)

# Seed BEFORE partitioning: createDataPartition() samples at random, so
# without this the 80/20 split (and every number below) changes on each run.
set.seed(825)
inTraining <- createDataPartition(rfdata$Physical.Check.up, p = .8, list = FALSE)
training <- rfdata[inTraining, ]
testing <- rfdata[-inTraining, ]

# 10-fold cross-validation, repeated ten times.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
)

cl <- makePSOCKcluster(10)
registerDoParallel(cl)
# tryCatch(finally = ...) guarantees the workers are shut down even when
# train() fails, and re-registers the sequential backend so later foreach
# users do not try to talk to the stopped cluster.
rf_fit <- tryCatch(
  {
    # Re-seed so the resampling is the same for a given training set.
    set.seed(825)
    train(
      Physical.Check.up ~ .,
      data = training,
      method = "rf",
      trControl = fitControl
    )
  },
  finally = {
    stopCluster(cl)
    foreach::registerDoSEQ()
  }
)

varImp(rf_fit)
rf variable importance

  only 20 most important variables shown (out of 39)

                                        Overall
Duration.of.Residency                   100.000
Age                                      84.561
Health.InsuranceYes                      36.124
Dental.InsuranceYes                      35.967
Income_medianAbove                       22.141
GenderMale                               18.296
EmploymentEmployed full time             13.234
EnglishDiffNot much                      12.709
Discrimination                           12.332
Primary.Language1                        11.846
Familiarity.with.Ethnic.OriginHigh       11.255
EnglishSpeakVery well                    10.775
BelongingSomewhat                        10.739
EthnicityKorean                          10.614
Identify.EthnicallyVery close            10.303
Familiarity.with.Ethnic.OriginVery high  10.108
Familiarity.with.AmericaLow              10.000
Familiarity.with.AmericaHigh              9.953
EnglishSpeakWell                          9.803
Identify.EthnicallySomewhat close         9.520
Code
# Predict the held-out rows with the tuned random forest.
pc_pred <- predict(rf_fit, newdata = testing)

# The outcome has levels "0" and "Yes". confusionMatrix() treats the first
# level as the positive class by default, which would report sensitivity and
# specificity for "0"; name "Yes" explicitly as the event of interest.
confusionMatrix(
  data = pc_pred,
  reference = testing$Physical.Check.up,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    32   7
       Yes  94 260
                                          
               Accuracy : 0.743           
                 95% CI : (0.6968, 0.7855)
    No Information Rate : 0.6794          
    P-Value [Acc > NIR] : 0.003544        
                                          
                  Kappa : 0.2785          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.25397         
            Specificity : 0.97378         
         Pos Pred Value : 0.82051         
         Neg Pred Value : 0.73446         
             Prevalence : 0.32061         
         Detection Rate : 0.08142         
   Detection Prevalence : 0.09924         
      Balanced Accuracy : 0.61388         
                                          
       'Positive' Class : 0               
                                          

Dental Check-up

Code
# install.packages("randomForest")  # backend used by caret's method = "rf"

# Modelling frame for the Dentist Check-up outcome: keep the outcome and the
# chosen predictors, drop rows with any missing value, shorten three column
# names, and make every name syntactically valid for the formula interface.
rfdata <- qol |>
  select(
    `Dentist Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame() |>
  rename_with(make.names)

# Seed BEFORE partitioning: createDataPartition() samples at random, so
# without this the 80/20 split (and every number below) changes on each run.
set.seed(825)
inTraining <- createDataPartition(rfdata$Dentist.Check.up, p = .8, list = FALSE)
training <- rfdata[inTraining, ]
testing <- rfdata[-inTraining, ]

# 10-fold cross-validation, repeated ten times.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
)

cl <- makePSOCKcluster(10)
registerDoParallel(cl)
# tryCatch(finally = ...) guarantees the workers are shut down even when
# train() fails, and re-registers the sequential backend so later foreach
# users do not try to talk to the stopped cluster.
rf_fit <- tryCatch(
  {
    # Re-seed so the resampling is the same for a given training set.
    set.seed(825)
    train(
      Dentist.Check.up ~ .,
      data = training,
      method = "rf",
      trControl = fitControl
    )
  },
  finally = {
    stopCluster(cl)
    foreach::registerDoSEQ()
  }
)

varImp(rf_fit)
rf variable importance

  only 20 most important variables shown (out of 39)

                                        Overall
Duration.of.Residency                   100.000
Age                                      60.759
Dental.InsuranceYes                      56.719
Income_medianAbove                       25.238
Health.InsuranceYes                      20.080
GenderMale                               14.996
EthnicityAsian Indian                    13.683
EmploymentEmployed full time             12.474
Primary.Language1                        11.544
EnglishSpeakVery well                    11.352
ReligionHindu                            11.042
Discrimination                           10.731
Familiarity.with.AmericaLow              10.302
EnglishDiffVery much                      9.654
Identify.EthnicallySomewhat close         9.617
BelongingSomewhat                         9.083
Familiarity.with.Ethnic.OriginHigh        8.995
ReligionCatholic                          8.614
Familiarity.with.AmericaHigh              8.484
Familiarity.with.Ethnic.OriginVery high   8.464
Code
# Predict the held-out rows with the tuned random forest.
pc_pred <- predict(rf_fit, newdata = testing)

# The outcome has levels "0" and "Yes". confusionMatrix() treats the first
# level as the positive class by default, which would report sensitivity and
# specificity for "0"; name "Yes" explicitly as the event of interest.
confusionMatrix(
  data = pc_pred,
  reference = testing$Dentist.Check.up,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0    91  31
       Yes  71 199
                                          
               Accuracy : 0.7398          
                 95% CI : (0.6934, 0.7826)
    No Information Rate : 0.5867          
    P-Value [Acc > NIR] : 1.796e-10       
                                          
                  Kappa : 0.4431          
                                          
 Mcnemar's Test P-Value : 0.0001127       
                                          
            Sensitivity : 0.5617          
            Specificity : 0.8652          
         Pos Pred Value : 0.7459          
         Neg Pred Value : 0.7370          
             Prevalence : 0.4133          
         Detection Rate : 0.2321          
   Detection Prevalence : 0.3112          
      Balanced Accuracy : 0.7135          
                                          
       'Positive' Class : 0               
                                          

Folkmedicine

Code
# install.packages("randomForest")  # backend used by caret's method = "rf"

# Modelling frame for the Folkmedicine outcome: keep the outcome and the
# chosen predictors, drop rows with any missing value, shorten three column
# names, and make every name syntactically valid for the formula interface.
rfdata <- qol |>
  select(
    `Folkmedicine`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame() |>
  rename_with(make.names)

# Seed BEFORE partitioning: createDataPartition() samples at random, so
# without this the 80/20 split (and every number below) changes on each run.
set.seed(825)
inTraining <- createDataPartition(rfdata$Folkmedicine, p = .8, list = FALSE)
training <- rfdata[inTraining, ]
testing <- rfdata[-inTraining, ]

# 10-fold cross-validation, repeated ten times.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
)

cl <- makePSOCKcluster(10)
registerDoParallel(cl)
# tryCatch(finally = ...) guarantees the workers are shut down even when
# train() fails, and re-registers the sequential backend so later foreach
# users do not try to talk to the stopped cluster.
rf_fit <- tryCatch(
  {
    # Re-seed so the resampling is the same for a given training set.
    set.seed(825)
    train(
      Folkmedicine ~ .,
      data = training,
      method = "rf",
      trControl = fitControl
    )
  },
  finally = {
    stopCluster(cl)
    foreach::registerDoSEQ()
  }
)

varImp(rf_fit)
rf variable importance

  only 20 most important variables shown (out of 39)

                                   Overall
Age                                 100.00
Duration.of.Residency                90.04
ReligionProtestant                   23.50
Discrimination                       22.69
EthnicityKorean                      20.57
GenderMale                           20.00
Income_medianAbove                   19.77
Dental.InsuranceYes                  17.49
EnglishSpeakVery well                17.05
EmploymentEmployed full time         17.03
Familiarity.with.AmericaHigh         16.86
Familiarity.with.Ethnic.OriginHigh   16.65
EnglishSpeakWell                     16.18
EnglishDiffNot much                  15.64
Primary.Language1                    15.41
Identify.EthnicallyVery close        15.26
BelongingSomewhat                    15.22
Identify.EthnicallySomewhat close    15.03
EnglishDiffMuch                      14.76
BelongingNot very much               14.17
Code
# Predict the held-out rows with the tuned random forest.
pc_pred <- predict(rf_fit, newdata = testing)

# The outcome has levels "0" and "Yes". confusionMatrix() treats the first
# level as the positive class by default, which would report sensitivity and
# specificity for "0"; name "Yes" explicitly as the event of interest.
# NOTE(review): in the rendered output every test row is predicted "0"
# (Kappa = 0, accuracy equal to the no-information rate), so this model does
# not separate the classes -- the "Yes" class may need imbalance handling.
confusionMatrix(
  data = pc_pred,
  reference = testing$Folkmedicine,
  positive = "Yes"
)
Confusion Matrix and Statistics

          Reference
Prediction   0 Yes
       0   336  53
       Yes   0   0
                                          
               Accuracy : 0.8638          
                 95% CI : (0.8256, 0.8962)
    No Information Rate : 0.8638          
    P-Value [Acc > NIR] : 0.5365          
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : 9.148e-13       
                                          
            Sensitivity : 1.0000          
            Specificity : 0.0000          
         Pos Pred Value : 0.8638          
         Neg Pred Value :    NaN          
             Prevalence : 0.8638          
         Detection Rate : 0.8638          
   Detection Prevalence : 1.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : 0